# Importing necessary libraries
library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5 v purrr 0.3.4
## v tibble 3.1.4 v dplyr 1.0.7
## v tidyr 1.1.3 v stringr 1.4.0
## v readr 2.0.1 v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(ggplot2)
library(plotly)
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
library(usmap)
## Warning: package 'usmap' was built under R version 4.1.2
library(mlr3)
## Warning: package 'mlr3' was built under R version 4.1.2
library(mlr3learners)
## Warning: package 'mlr3learners' was built under R version 4.1.2
library(mlr3pipelines)
## Warning: package 'mlr3pipelines' was built under R version 4.1.2
library(mlr3tuning)
## Loading required package: paradox
## Warning: package 'paradox' was built under R version 4.1.2
library(paradox)
# Load the lookup tables and the two yearly flight files.
# Empty fields in plane-data.csv are read as NA (na.strings = "").
airports <- read.csv(file = "airports.csv")
carriers <- read.csv(file = "carriers.csv")
plane <- read.csv(file = "plane-data.csv", na.strings = "")
flight_1 <- read.csv(file = "2003.csv")
flight_2 <- read.csv(file = "2004.csv")
# Stack 2003.csv on top of 2004.csv to get one flight table
flight <- rbind(flight_1, flight_2)
# Statistical summary of the combined flight data
summary(object = flight)
## Year Month DayofMonth DayOfWeek
## Min. :2003 Min. : 1.000 Min. : 1.00 Min. :1.000
## 1st Qu.:2003 1st Qu.: 4.000 1st Qu.: 8.00 1st Qu.:2.000
## Median :2004 Median : 7.000 Median :16.00 Median :4.000
## Mean :2004 Mean : 6.538 Mean :15.74 Mean :3.937
## 3rd Qu.:2004 3rd Qu.:10.000 3rd Qu.:23.00 3rd Qu.:6.000
## Max. :2004 Max. :12.000 Max. :31.00 Max. :7.000
##
## DepTime CRSDepTime ArrTime CRSArrTime
## Min. : 1 Min. : 0 Min. : 1 Min. : 0
## 1st Qu.: 938 1st Qu.: 935 1st Qu.:1123 1st Qu.:1128
## Median :1330 Median :1330 Median :1524 Median :1527
## Mean :1346 Mean :1341 Mean :1500 Mean :1507
## 3rd Qu.:1732 3rd Qu.:1725 3rd Qu.:1915 3rd Qu.:1911
## Max. :2750 Max. :2400 Max. :2955 Max. :2400
## NA's :229226 NA's :254391
## UniqueCarrier FlightNum TailNum ActualElapsedTime
## Length:13617810 Min. : 1 Length:13617810 Min. :-710.0
## Class :character 1st Qu.: 587 Class :character 1st Qu.: 72.0
## Mode :character Median :1408 Mode :character Median : 104.0
## Mean :2054 Mean : 122.9
## 3rd Qu.:2887 3rd Qu.: 154.0
## Max. :9912 Max. :1777.0
## NA's :254392
## CRSElapsedTime AirTime ArrDelay DepDelay
## Min. : -85.0 Min. :-3818.0 Min. :-1302.00 Min. :-1410.00
## 1st Qu.: 74.0 1st Qu.: 53.0 1st Qu.: -10.00 1st Qu.: -4.00
## Median : 105.0 Median : 85.0 Median : -2.00 Median : 0.00
## Mean : 124.1 Mean : 103.3 Mean : 5.12 Mean : 6.63
## 3rd Qu.: 155.0 3rd Qu.: 137.0 3rd Qu.: 9.00 3rd Qu.: 4.00
## Max. :1441.0 Max. : 3508.0 Max. : 1879.00 Max. : 1882.00
## NA's :1 NA's :254391 NA's :254392 NA's :229226
## Origin Dest Distance TaxiIn
## Length:13617810 Length:13617810 Min. : 6.0 Min. : 0.000
## Class :character Class :character 1st Qu.: 304.0 1st Qu.: 4.000
## Mode :character Mode :character Median : 547.0 Median : 5.000
## Mean : 712.6 Mean : 7.469
## 3rd Qu.: 944.0 3rd Qu.: 7.000
## Max. :4962.0 Max. :1495.000
##
## TaxiOut Cancelled CancellationCode Diverted
## Min. : 0.00 Min. :0.00000 Length:13617810 Min. :0.000000
## 1st Qu.: 10.00 1st Qu.:0.00000 Class :character 1st Qu.:0.000000
## Median : 13.00 Median :0.00000 Mode :character Median :0.000000
## Mean : 15.42 Mean :0.01683 Mean :0.001848
## 3rd Qu.: 18.00 3rd Qu.:0.00000 3rd Qu.:0.000000
## Max. :3905.00 Max. :1.00000 Max. :1.000000
##
## CarrierDelay WeatherDelay NASDelay SecurityDelay
## Min. : 0.0 Min. : 0.0 Min. : -60.0 Min. : 0
## 1st Qu.: 0.0 1st Qu.: 0.0 1st Qu.: 0.0 1st Qu.: 0
## Median : 0.0 Median : 0.0 Median : 0.0 Median : 0
## Mean : 2.5 Mean : 0.6 Mean : 3.3 Mean : 0
## 3rd Qu.: 0.0 3rd Qu.: 0.0 3rd Qu.: 0.0 3rd Qu.: 0
## Max. :1879.0 Max. :1230.0 Max. :1385.0 Max. :533
## NA's :2672742 NA's :2672742 NA's :2672742 NA's :2672742
## LateAircraftDelay
## Min. : 0.0
## 1st Qu.: 0.0
## Median : 0.0
## Mean : 3.2
## 3rd Qu.: 0.0
## Max. :1407.0
## NA's :2672742
# Keep only flights that have a departure time, an arrival time and an
# arrival delay recorded, then summarise the cleaned table.
flight <- drop_na(flight, DepTime, ArrTime, ArrDelay)
summary(object = flight)
## Year Month DayofMonth DayOfWeek DepTime
## Min. :2003 Min. : 1.000 Min. : 1.00 Min. :1.00 Min. : 1
## 1st Qu.:2003 1st Qu.: 4.000 1st Qu.: 8.00 1st Qu.:2.00 1st Qu.: 938
## Median :2004 Median : 7.000 Median :16.00 Median :4.00 Median :1330
## Mean :2004 Mean : 6.541 Mean :15.75 Mean :3.94 Mean :1346
## 3rd Qu.:2004 3rd Qu.:10.000 3rd Qu.:23.00 3rd Qu.:6.00 3rd Qu.:1732
## Max. :2004 Max. :12.000 Max. :31.00 Max. :7.00 Max. :2750
##
## CRSDepTime ArrTime CRSArrTime UniqueCarrier FlightNum
## Min. : 0 Min. : 1 Min. : 0 Length:13363418 Min. : 1
## 1st Qu.: 935 1st Qu.:1123 1st Qu.:1128 Class :character 1st Qu.: 585
## Median :1328 Median :1524 Median :1526 Mode :character Median :1403
## Mean :1340 Mean :1500 Mean :1506 Mean :2042
## 3rd Qu.:1725 3rd Qu.:1915 3rd Qu.:1910 3rd Qu.:2857
## Max. :2400 Max. :2955 Max. :2400 Max. :9912
##
## TailNum ActualElapsedTime CRSElapsedTime AirTime
## Length:13363418 Min. :-710.0 Min. : -32.0 Min. :-3818.0
## Class :character 1st Qu.: 72.0 1st Qu.: 75.0 1st Qu.: 53.0
## Mode :character Median : 104.0 Median : 105.0 Median : 85.0
## Mean : 122.9 Mean : 124.4 Mean : 103.3
## 3rd Qu.: 154.0 3rd Qu.: 155.0 3rd Qu.: 137.0
## Max. :1777.0 Max. :1441.0 Max. : 3508.0
##
## ArrDelay DepDelay Origin Dest
## Min. :-1302.000 Min. :-1410.000 Length:13363418 Length:13363418
## 1st Qu.: -10.000 1st Qu.: -4.000 Class :character Class :character
## Median : -2.000 Median : 0.000 Mode :character Mode :character
## Mean : 5.119 Mean : 6.601
## 3rd Qu.: 9.000 3rd Qu.: 4.000
## Max. : 1879.000 Max. : 1882.000
##
## Distance TaxiIn TaxiOut Cancelled
## Min. : 8 Min. : 0.000 Min. : 0.00 Min. :0
## 1st Qu.: 305 1st Qu.: 4.000 1st Qu.: 10.00 1st Qu.:0
## Median : 547 Median : 5.000 Median : 13.00 Median :0
## Mean : 715 Mean : 7.474 Mean : 15.67 Mean :0
## 3rd Qu.: 946 3rd Qu.: 7.000 3rd Qu.: 18.00 3rd Qu.:0
## Max. :4962 Max. :1495.000 Max. :3905.00 Max. :0
##
## CancellationCode Diverted CarrierDelay WeatherDelay
## Length:13363418 Min. :0 Min. : 0.0 Min. : 0.0
## Class :character 1st Qu.:0 1st Qu.: 0.0 1st Qu.: 0.0
## Mode :character Median :0 Median : 0.0 Median : 0.0
## Mean :0 Mean : 2.6 Mean : 0.7
## 3rd Qu.:0 3rd Qu.: 0.0 3rd Qu.: 0.0
## Max. :0 Max. :1879.0 Max. :1230.0
## NA's :2619866 NA's :2619866
## NASDelay SecurityDelay LateAircraftDelay
## Min. : -60.0 Min. : 0 Min. : 0.0
## 1st Qu.: 0.0 1st Qu.: 0 1st Qu.: 0.0
## Median : 0.0 Median : 0 Median : 0.0
## Mean : 3.4 Mean : 0 Mean : 3.2
## 3rd Qu.: 0.0 3rd Qu.: 0 3rd Qu.: 0.0
## Max. :1385.0 Max. :533 Max. :1407.0
## NA's :2619866 NA's :2619866 NA's :2619866
When is the best time of day, day of the week, and time of year to fly to minimise delays?
# Label each flight with a 3-hour departure window based on DepTime
# (presumably an hhmm clock value - confirm against the data dictionary).
# Values above 2400 are put in the "12 AM - 3 AM" window; a value that
# matches no branch (e.g. 0) gets NA.
flight <- mutate(
  flight,
  new_dep = case_when(
    DepTime >= 1 & DepTime <= 300 ~ "12 AM - 3 AM",
    DepTime >= 301 & DepTime <= 600 ~ "3 AM - 6 AM",
    DepTime >= 601 & DepTime <= 900 ~ "6 AM - 9 AM",
    DepTime >= 901 & DepTime <= 1200 ~ "9 AM - 12 PM",
    DepTime >= 1201 & DepTime <= 1500 ~ "12 PM - 3 PM",
    DepTime >= 1501 & DepTime <= 1800 ~ "3 PM - 6 PM",
    DepTime >= 1801 & DepTime <= 2100 ~ "6 PM - 9 PM",
    DepTime >= 2101 & DepTime <= 2400 ~ "9 PM - 12 AM",
    DepTime > 2400 ~ "12 AM - 3 AM"
  )
)
# For late arrivals only (ArrDelay > 0): mean delay and number of delayed
# flights per departure window, sorted from the smallest mean delay upwards.
# The window label is then turned into a factor whose levels follow the clock,
# so plots show the windows in chronological order.
best_time_of_day <- flight %>%
  filter(ArrDelay > 0) %>%
  group_by(new_dep) %>%
  summarise(
    avg_delay_in_mins = round(mean(ArrDelay), 2),
    delay_counts = n()
  ) %>%
  arrange(avg_delay_in_mins) %>%
  mutate(
    new_dep = factor(
      new_dep,
      levels = c(
        "12 AM - 3 AM", "3 AM - 6 AM", "6 AM - 9 AM", "9 AM - 12 PM",
        "12 PM - 3 PM", "3 PM - 6 PM", "6 PM - 9 PM", "9 PM - 12 AM"
      )
    )
  )
best_time_of_day
## # A tibble: 8 x 3
## new_dep avg_delay_in_mins delay_counts
## <fct> <dbl> <int>
## 1 3 AM - 6 AM 12.5 50906
## 2 6 AM - 9 AM 15.1 807703
## 3 9 AM - 12 PM 20.0 963895
## 4 12 PM - 3 PM 23.0 1037391
## 5 3 PM - 6 PM 27.2 1155089
## 6 6 PM - 9 PM 34.3 1029275
## 7 9 PM - 12 AM 48.5 385752
## 8 12 AM - 3 AM 84.3 29772
# Figure 1: bar chart of the mean arrival delay per 3-hour departure window.
# Bars are shaded by the delay value and labelled with it in white.
ggplot(
  best_time_of_day,
  aes(x = new_dep, y = avg_delay_in_mins, fill = avg_delay_in_mins)
) +
  geom_col() +
  geom_text(aes(label = avg_delay_in_mins), vjust = 1.4, colour = "white") +
  scale_fill_gradient(low = "#1DB0CA", high = "#232971") +
  labs(
    x = NULL,
    y = "Avg Delay (min)",
    title = "Average Flight Delay in 2003-2004",
    subtitle = "(per 3 hours)",
    caption = "Figure 1"
  ) +
  theme_classic() +
  theme(
    axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1),
    plot.title = element_text(hjust = 0.5, face = "bold"),
    plot.subtitle = element_text(hjust = 0.5),
    plot.caption = element_text(hjust = 0.5),
    legend.position = "none"
  )
# For late arrivals only (ArrDelay > 0): mean delay and number of delayed
# flights per day of week, with the numeric day code (1-7) replaced by its
# name (1 = Monday ... 7 = Sunday) and rows sorted by mean delay.
# The day name becomes a factor with levels listed Sunday -> Monday.
best_day_of_week <- flight %>%
  filter(ArrDelay > 0) %>%
  group_by(DayOfWeek) %>%
  summarise(
    avg_delay_in_min = round(mean(ArrDelay), 2),
    delay_count = n()
  ) %>%
  mutate(
    DayOfWeek = case_when(
      DayOfWeek == 1 ~ "Monday",
      DayOfWeek == 2 ~ "Tuesday",
      DayOfWeek == 3 ~ "Wednesday",
      DayOfWeek == 4 ~ "Thursday",
      DayOfWeek == 5 ~ "Friday",
      DayOfWeek == 6 ~ "Saturday",
      DayOfWeek == 7 ~ "Sunday"
    )
  ) %>%
  arrange(avg_delay_in_min) %>%
  mutate(
    DayOfWeek = factor(
      DayOfWeek,
      levels = c(
        "Sunday", "Saturday", "Friday", "Thursday",
        "Wednesday", "Tuesday", "Monday"
      )
    )
  )
best_day_of_week
## # A tibble: 7 x 3
## DayOfWeek avg_delay_in_min delay_count
## <fct> <dbl> <int>
## 1 Saturday 23.3 600173
## 2 Tuesday 25.5 755547
## 3 Wednesday 26.2 796757
## 4 Friday 26.6 883602
## 5 Thursday 27.0 860557
## 6 Sunday 27.2 750076
## 7 Monday 27.6 813071
# Figure 2: horizontal bar chart of the mean arrival delay per day of week,
# one colour per day, each bar labelled with its value.
ggplot(
  best_day_of_week,
  aes(x = DayOfWeek, y = avg_delay_in_min, fill = DayOfWeek)
) +
  geom_col() +
  geom_text(aes(label = avg_delay_in_min), hjust = 1.2) +
  ylim(0, 28) +
  labs(
    x = NULL,
    y = "Avg Delay (min)",
    title = "Average Flight Delay in 2003-2004",
    subtitle = "(per day)",
    caption = "Figure 2"
  ) +
  theme_classic() +
  theme(
    plot.title = element_text(hjust = 0.5, face = "bold"),
    plot.subtitle = element_text(hjust = 0.5),
    plot.caption = element_text(hjust = 0.5),
    legend.position = "none"
  ) +
  coord_flip()
# Derive the calendar quarter ("Q1".."Q4") from Month; it is used below to
# compare delays across the year.
flight <- mutate(
  flight,
  quarter = case_when(
    Month %in% 1:3 ~ "Q1",
    Month %in% 4:6 ~ "Q2",
    Month %in% 7:9 ~ "Q3",
    Month %in% 10:12 ~ "Q4"
  )
)
# Quarterly figures are computed separately for 2003 and 2004.
# 2003: mean delay and number of late arrivals (ArrDelay > 0) per quarter.
best_time_of_2003 <- flight %>%
  filter(ArrDelay > 0, Year == 2003) %>%
  group_by(Year, quarter) %>%
  summarise(
    avg_delay_in_min = round(mean(ArrDelay), 2),
    delay_count = n()
  )
## `summarise()` has grouped output by 'Year'. You can override using the `.groups` argument.
# 2004: mean delay and number of late arrivals (ArrDelay > 0) per quarter.
best_time_of_2004 <- flight %>%
  filter(ArrDelay > 0, Year == 2004) %>%
  group_by(Year, quarter) %>%
  summarise(
    avg_delay_in_min = round(mean(ArrDelay), 2),
    delay_count = n()
  )
## `summarise()` has grouped output by 'Year'. You can override using the `.groups` argument.
# Preview of best quarter in 2003
best_time_of_2003
## # A tibble: 4 x 4
## # Groups: Year [1]
## Year quarter avg_delay_in_min delay_count
## <int> <chr> <dbl> <int>
## 1 2003 Q1 24.9 596997
## 2 2003 Q2 22.9 557582
## 3 2003 Q3 27.1 625177
## 4 2003 Q4 24.1 661585
# Preview of best quarter in 2004
best_time_of_2004
## # A tibble: 4 x 4
## # Groups: Year [1]
## Year quarter avg_delay_in_min delay_count
## <int> <chr> <dbl> <int>
## 1 2004 Q1 26.4 737006
## 2 2004 Q2 28.9 760521
## 3 2004 Q3 28.2 726691
## 4 2004 Q4 27.0 794224
# Figure 3: quarterly mean arrival delay, one line per year.
# Both yearly summaries are stacked into one table and Year is mapped to
# colour and linetype inside aes(); this is what makes the manual scales
# apply and the "Year" legend appear (constant colours set outside aes()
# never reach a scale). 2003 is solid dark blue, 2004 dashed dark red.
bind_rows(ungroup(best_time_of_2003), ungroup(best_time_of_2004)) %>%
  mutate(Year = factor(Year)) %>%
  ggplot(aes(x = quarter, y = avg_delay_in_min, group = Year)) +
  geom_line(aes(colour = Year, linetype = Year)) +
  # Value labels stay in the default text colour, as before
  geom_text(aes(label = avg_delay_in_min), hjust = 1.2) +
  labs(
    x = NULL,
    y = "Avg Delay (min)",
    title = "Average Flight Delay in 2003-2004",
    subtitle = "(quarterly)",
    caption = "Figure 3"
  ) +
  theme_classic() +
  theme(
    plot.title = element_text(hjust = 0.5, face = "bold"),
    plot.subtitle = element_text(hjust = 0.5),
    plot.caption = element_text(hjust = 0.5)
  ) +
  scale_colour_manual(
    name = "Year",
    breaks = c("2003", "2004"),
    values = c("2003" = "darkblue", "2004" = "darkred")
  ) +
  # Same name and breaks as the colour scale so both merge into one legend
  scale_linetype_manual(
    name = "Year",
    breaks = c("2003", "2004"),
    values = c("2003" = "solid", "2004" = "dashed")
  )
Do older planes suffer more delays?
# Drop every plane record that has an NA in any column, then preview the result.
# plane-data.csv was read with na.strings = "", so empty fields count as NA here.
plane <- na.omit(plane)
head(plane)
## tailnum type manufacturer issue_date model status
## 35 N10156 Corporation EMBRAER 02/13/2004 EMB-145XR Valid
## 36 N102UW Corporation AIRBUS INDUSTRIE 05/26/1999 A320-214 Valid
## 37 N10323 Corporation BOEING 07/01/1997 737-3TO Valid
## 38 N103US Corporation AIRBUS INDUSTRIE 06/18/1999 A320-214 Valid
## 39 N104UA Corporation BOEING 01/26/1998 747-422 Valid
## 40 N104UW Corporation AIRBUS INDUSTRIE 07/02/1999 A320-214 Valid
## aircraft_type engine_type year
## 35 Fixed Wing Multi-Engine Turbo-Fan 2004
## 36 Fixed Wing Multi-Engine Turbo-Fan 1998
## 37 Fixed Wing Multi-Engine Turbo-Jet 1986
## 38 Fixed Wing Multi-Engine Turbo-Fan 1999
## 39 Fixed Wing Multi-Engine Turbo-Fan 1998
## 40 Fixed Wing Multi-Engine Turbo-Fan 1999
# Attach each late flight to its aircraft record (matched on tail number) and
# compute, per manufacturing year and tail number, the mean arrival delay and
# the number of late arrivals, sorted by manufacturing year.
# The ArrDelay > 0 filter runs before the join: it only uses a flight column,
# so the result is the same, but far fewer rows go through the join.
older_delays <- flight %>%
  filter(ArrDelay > 0) %>%
  left_join(plane, by = c("TailNum" = "tailnum")) %>%
  group_by(year, TailNum) %>%
  summarise(
    avg_delay = mean(ArrDelay),
    delay_counts = n()
  ) %>%
  arrange(year)
## `summarise()` has grouped output by 'year'. You can override using the `.groups` argument.
head(older_delays)
## # A tibble: 6 x 4
## # Groups: year [4]
## year TailNum avg_delay delay_counts
## <chr> <chr> <dbl> <int>
## 1 0000 N235SW 16.6 1745
## 2 0000 N384AE 23.8 385
## 3 1956 N381AA 27.9 278
## 4 1957 N3744D 19.9 801
## 5 1959 N201AA 29.5 852
## 6 1959 N567AA 31.1 838
# Keep only planes with a usable manufacturing year for flights in 2003-2004.
# `year` arrives as text (values such as "None" and "0000" occur), so rows are
# first restricted to four-digit strings, converted to integer, and only then
# range-checked numerically. This avoids comparing years as strings, whose
# ordering depends on the locale's collation rules.
older_delays <- na.omit(older_delays) %>%
  ungroup() %>% # a grouping column cannot be modified inside mutate()
  filter(grepl("^[0-9]{4}$", year)) %>%
  mutate(year = as.integer(year)) %>%
  filter(year > 0L, year <= 2004L) %>%
  group_by(year) # restore the grouping the table had before
head(older_delays)
## # A tibble: 6 x 4
## # Groups: year [5]
## year TailNum avg_delay delay_counts
## <int> <chr> <dbl> <int>
## 1 1956 N381AA 27.9 278
## 2 1957 N3744D 19.9 801
## 3 1959 N201AA 29.5 852
## 4 1959 N567AA 31.1 838
## 5 1962 N421AA 32.1 960
## 6 1963 N378AA 30.0 297
# Figure 5: jittered scatter of each plane's mean delay against its
# manufacturing year. If older planes suffered more delays, the mean delay
# should fall as the manufacturing year rises (a negative relationship);
# the plot does not show such a pattern.
ggplot(older_delays, aes(x = year, y = avg_delay, colour = year)) +
  geom_jitter() +
  scale_color_steps(low = "#9D43C2", high = "#482158") +
  labs(
    x = NULL,
    y = "Avg Delay (Min)",
    title = "Scatter Plot of Average Flight Delay by Year",
    caption = "Figure 5"
  ) +
  theme(
    axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1),
    plot.title = element_text(hjust = 0.5, face = "bold"),
    plot.caption = element_text(hjust = 0.5),
    legend.position = "none"
  )
How does the number of people flying between different locations change over time?
# Build a route key of the form "<Origin>-<Dest>" for every flight
flight <- mutate(flight, org_dest = paste0(Origin, "-", Dest))
# Number of flights per route, computed separately for each year.
# The former trailing arrange() had no sort key and therefore did nothing;
# it has been removed.
org_dest_2003 <- flight %>%
  filter(Year == 2003) %>%
  group_by(org_dest) %>%
  summarise(total_flight_2003 = n())
org_dest_2004 <- flight %>%
  filter(Year == 2004) %>%
  group_by(org_dest) %>%
  summarise(total_flight_2004 = n())
# Compare the per-route counts of both years (only routes present in both).
# `difference` is 2004 minus 2003; `perc_change` is that difference as a
# fraction of the 2003 count (not multiplied by 100).
# more_2003: the 10 routes with the largest relative increase.
more_2003 <- inner_join(org_dest_2003, org_dest_2004, by = "org_dest") %>%
  mutate(
    difference = total_flight_2004 - total_flight_2003,
    perc_change = round(difference / total_flight_2003, 6)
  ) %>%
  arrange(desc(perc_change)) %>%
  head(10)
# more_2004: the 10 routes with the largest relative decrease.
more_2004 <- inner_join(org_dest_2003, org_dest_2004, by = "org_dest") %>%
  mutate(
    difference = total_flight_2004 - total_flight_2003,
    perc_change = round(difference / total_flight_2003, 6)
  ) %>%
  arrange(perc_change) %>%
  head(10)
more_2003
## # A tibble: 10 x 5
## org_dest total_flight_2003 total_flight_2004 difference perc_change
## <chr> <int> <int> <int> <dbl>
## 1 TLH-IAH 1 464 463 463
## 2 RSW-MCO 1 346 345 345
## 3 SGF-MEM 1 305 304 304
## 4 DFW-PIA 2 334 332 166
## 5 OKC-DTW 2 205 203 102.
## 6 DFW-MYR 1 90 89 89
## 7 PHL-AVP 3 252 249 83
## 8 AVP-PHL 4 258 254 63.5
## 9 CVG-CAK 41 2571 2530 61.7
## 10 CAK-CVG 41 2546 2505 61.1
more_2004
## # A tibble: 10 x 5
## org_dest total_flight_2003 total_flight_2004 difference perc_change
## <chr> <int> <int> <int> <dbl>
## 1 CDC-SGU 128 1 -127 -0.992
## 2 STL-SAT 1129 14 -1115 -0.988
## 3 SAT-STL 1127 14 -1113 -0.988
## 4 DTW-SJC 307 4 -303 -0.987
## 5 SLC-AUS 1044 15 -1029 -0.986
## 6 SLC-SAT 663 10 -653 -0.985
## 7 SJC-DTW 306 5 -301 -0.984
## 8 TUL-OKC 364 6 -358 -0.984
## 9 AUS-SLC 1036 18 -1018 -0.983
## 10 OKC-TUL 365 7 -358 -0.981
# Departures per origin airport in each year, then the relative change
# from 2003 to 2004 (as a fraction, rounded to 2 decimals) together with the
# airport coordinates. Only airports present in both years are kept.
origin_03 <- flight %>%
  filter(Year == 2003) %>%
  group_by(Origin) %>%
  summarise(total_flight_03 = n())
origin_04 <- flight %>%
  filter(Year == 2004) %>%
  group_by(Origin) %>%
  summarise(total_flight_04 = n())
origin_comparison <- inner_join(origin_03, origin_04, by = "Origin") %>%
  left_join(airports, by = c("Origin" = "iata")) %>%
  select(Origin, long, lat, total_flight_03, total_flight_04) %>%
  mutate(
    perc_diff = round((total_flight_04 - total_flight_03) / total_flight_03, 2)
  ) %>%
  arrange(perc_diff)
# Ten origins with the largest relative decrease
head(origin_comparison, 10)
## # A tibble: 10 x 6
## Origin long lat total_flight_03 total_flight_04 perc_diff
## <chr> <dbl> <dbl> <int> <int> <dbl>
## 1 DUT -167. 53.9 511 37 -0.93
## 2 DRO -108. 37.2 1519 444 -0.71
## 3 FMN -108. 36.7 2 1 -0.5
## 4 STL -90.4 38.7 99485 60308 -0.39
## 5 STX -64.8 17.7 622 399 -0.36
## 6 EFD -95.2 29.6 1537 1034 -0.33
## 7 ILE -97.7 31.1 4366 2904 -0.33
## 8 MEI -88.8 32.3 1506 1032 -0.31
## 9 WYS -111. 44.7 321 221 -0.31
## 10 CDC -113. 37.7 1048 737 -0.3
tail(origin_comparison)
## # A tibble: 6 x 6
## Origin long lat total_flight_03 total_flight_04 perc_diff
## <chr> <dbl> <dbl> <int> <int> <dbl>
## 1 LNY -157. 20.8 57 210 2.68
## 2 MKK -157. 21.2 57 210 2.68
## 3 AVP -75.7 41.3 806 3026 2.75
## 4 ITO -155. 19.7 357 2520 6.06
## 5 ACY -74.6 39.5 127 1056 7.31
## 6 ERI -80.2 42.1 59 1033 16.5
# Arrivals per destination airport in each year, then the relative change
# from 2003 to 2004 (as a fraction, rounded to 2 decimals) together with the
# airport coordinates. Only airports present in both years are kept.
dest_03 <- flight %>%
  filter(Year == 2003) %>%
  group_by(Dest) %>%
  summarise(total_flight_03 = n())
dest_04 <- flight %>%
  filter(Year == 2004) %>%
  group_by(Dest) %>%
  summarise(total_flight_04 = n())
dest_comparison <- inner_join(dest_03, dest_04, by = "Dest") %>%
  left_join(airports, by = c("Dest" = "iata")) %>%
  select(Dest, long, lat, total_flight_03, total_flight_04) %>%
  mutate(
    perc_diff = round((total_flight_04 - total_flight_03) / total_flight_03, 2)
  ) %>%
  arrange(perc_diff)
# Ten destinations with the largest relative decrease
head(dest_comparison, 10)
## # A tibble: 10 x 6
## Dest long lat total_flight_03 total_flight_04 perc_diff
## <chr> <dbl> <dbl> <int> <int> <dbl>
## 1 DUT -167. 53.9 522 38 -0.93
## 2 DRO -108. 37.2 1521 449 -0.7
## 3 STL -90.4 38.7 99481 60339 -0.39
## 4 STX -64.8 17.7 625 399 -0.36
## 5 ILE -97.7 31.1 4374 2906 -0.34
## 6 EFD -95.2 29.6 1533 1032 -0.33
## 7 CDC -113. 37.7 1048 728 -0.31
## 8 MEI -88.8 32.3 1516 1044 -0.31
## 9 WYS -111. 44.7 322 225 -0.3
## 10 GTR -88.6 33.5 1931 1379 -0.29
tail(dest_comparison)
## # A tibble: 6 x 6
## Dest long lat total_flight_03 total_flight_04 perc_diff
## <chr> <dbl> <dbl> <int> <int> <dbl>
## 1 MKK -157. 21.2 57 209 2.67
## 2 LNY -157. 20.8 60 222 2.7
## 3 AVP -75.7 41.3 816 3037 2.72
## 4 ITO -155. 19.7 357 2523 6.07
## 5 ACY -74.6 39.5 126 1060 7.41
## 6 ERI -80.2 42.1 61 1038 16.0
# Shared map settings for the two plotly maps below.
# The projection type was 'world', which is a scope value rather than a
# projection; 'albers usa' is the projection that goes with scope = 'usa'.
geo <- list(
  scope = 'usa',
  projection = list(type = 'albers usa'),
  showland = TRUE,
  landcolor = toRGB("gray95"),
  countrycolor = toRGB("gray80")
)
# Origin map: one marker per origin airport, coloured by the relative change
# in departures between 2003 and 2004; hovering shows the airport code and
# the change as a decimal fraction.
plot_geo(locationmode = 'USA-states') %>%
  add_markers(
    data = origin_comparison,
    x = ~long,
    y = ~lat,
    text = ~paste("Origin :", Origin,
                  "<br> Difference (decimal) :", perc_diff),
    color = ~perc_diff,
    hoverinfo = "text",
    alpha = 0.7,
    marker = list(size = 10)
  ) %>%
  layout(
    title = 'Origin Comparison between 2003-2004',
    geo = geo
  )
# Destination map: one marker per destination airport, coloured by the
# relative change in arrivals between 2003 and 2004; hovering shows the
# airport code and the change as a decimal fraction. Reuses the `geo` settings.
plot_geo(locationmode = 'USA-states') %>%
  add_markers(
    data = dest_comparison,
    x = ~long,
    y = ~lat,
    text = ~paste("Destination :", Dest,
                  "<br> Difference (decimal):", perc_diff),
    color = ~perc_diff,
    hoverinfo = "text",
    alpha = 0.7,
    marker = list(size = 10)
  ) %>%
  layout(title = 'Destination Comparison between 2003-2004', geo = geo)
Can you detect cascading failures as delays in one airport create delays in others?
# Five busiest routes by number of flights, highest first.
# slice_max() names the ranking column explicitly and returns rows in
# descending order; it replaces the superseded top_n(), which picked the
# last column implicitly. Ties at the cut-off are kept, as before.
flight %>%
  group_by(org_dest) %>%
  summarise(counts = n()) %>%
  slice_max(counts, n = 5)
## Selecting by counts
## # A tibble: 5 x 2
## org_dest counts
## <chr> <int>
## 1 LAX-SAN 29588
## 2 SAN-LAX 29524
## 3 BOS-LGA 25063
## 4 LGA-BOS 24561
## 5 LGA-DCA 24327
# Look for knock-on delays around LAX and SAN, the endpoints of the busiest
# route: flights touching either airport that arrived more than 60 minutes
# late, ordered by date, aircraft and departure time so that consecutive legs
# flown by the same tail number appear next to each other.
flight %>%
  filter(
    ArrDelay > 60,
    Origin %in% c("LAX", "SAN") | Dest %in% c("LAX", "SAN")
  ) %>%
  select(
    Year, Month, DayofMonth, TailNum, CRSDepTime, DepTime,
    CRSArrTime, ArrTime, DepDelay, ArrDelay, Origin, Dest
  ) %>%
  arrange(Year, Month, DayofMonth, TailNum, DepTime) %>%
  head(10)
## Year Month DayofMonth TailNum CRSDepTime DepTime CRSArrTime ArrTime DepDelay
## 1 2003 1 1 N1608 1020 1131 1735 1840 71
## 2 2003 1 1 N299SW 2327 2313 11 2351 0
## 3 2003 1 1 N302AA 2245 2357 620 754 72
## 4 2003 1 1 N339MQ 1310 1509 1405 1610 119
## 5 2003 1 1 N342 1630 1755 1740 1858 85
## 6 2003 1 1 N342 1810 1925 1910 2025 75
## 7 2003 1 1 N37277 1415 1524 2210 2327 69
## 8 2003 1 1 N450UA 815 1018 1028 1210 123
## 9 2003 1 1 N450UA 1124 1257 1715 1839 93
## 10 2003 1 1 N514UA 1605 1935 1913 2205 210
## ArrDelay Origin Dest
## 1 65 LAX ATL
## 2 1420 LAX PSP
## 3 94 LAX IAD
## 4 125 FAT LAX
## 5 78 LAS LAX
## 6 75 LAX LAS
## 7 77 SAN EWR
## 8 102 ORD SAN
## 9 84 SAN ORD
## 10 172 EWR LAX
Use the available variables to construct a model that predicts delays.
# Per-column overview of `flight` (missing counts, completeness and summary
# statistics) produced with skimr, to decide which variables to model with
skimr::skim(flight)
| Name | flight |
| Number of rows | 13363418 |
| Number of columns | 32 |
| _______________________ | |
| Column type frequency: | |
| character | 8 |
| numeric | 24 |
| ________________________ | |
| Group variables | None |
Variable type: character
| skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
|---|---|---|---|---|---|---|---|
| UniqueCarrier | 0 | 1.0 | 2 | 2 | 0 | 19 | 0 |
| TailNum | 0 | 1.0 | 0 | 6 | 58 | 5830 | 0 |
| Origin | 0 | 1.0 | 3 | 3 | 0 | 288 | 0 |
| Dest | 0 | 1.0 | 3 | 3 | 0 | 283 | 0 |
| CancellationCode | 2619866 | 0.8 | 0 | 1 | 10743544 | 4 | 0 |
| new_dep | 0 | 1.0 | 11 | 12 | 0 | 8 | 0 |
| quarter | 0 | 1.0 | 2 | 2 | 0 | 4 | 0 |
| org_dest | 0 | 1.0 | 7 | 7 | 0 | 4852 | 0 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| Year | 0 | 1.0 | 2003.52 | 0.50 | 2003 | 2003 | 2004 | 2004 | 2004 | ▇▁▁▁▇ |
| Month | 0 | 1.0 | 6.54 | 3.44 | 1 | 4 | 7 | 10 | 12 | ▇▅▆▅▇ |
| DayofMonth | 0 | 1.0 | 15.75 | 8.79 | 1 | 8 | 16 | 23 | 31 | ▇▇▇▇▆ |
| DayOfWeek | 0 | 1.0 | 3.94 | 1.99 | 1 | 2 | 4 | 6 | 7 | ▇▅▅▅▇ |
| DepTime | 0 | 1.0 | 1345.61 | 469.78 | 1 | 938 | 1330 | 1732 | 2750 | ▁▇▇▆▁ |
| CRSDepTime | 0 | 1.0 | 1340.35 | 460.10 | 0 | 935 | 1328 | 1725 | 2400 | ▁▇▇▇▃ |
| ArrTime | 0 | 1.0 | 1500.32 | 488.91 | 1 | 1123 | 1524 | 1915 | 2955 | ▁▆▇▇▁ |
| CRSArrTime | 0 | 1.0 | 1506.24 | 473.36 | 0 | 1128 | 1526 | 1910 | 2400 | ▁▃▇▇▆ |
| FlightNum | 0 | 1.0 | 2041.70 | 1939.24 | 1 | 585 | 1403 | 2857 | 9912 | ▇▂▁▁▁ |
| ActualElapsedTime | 0 | 1.0 | 122.91 | 70.16 | -710 | 72 | 104 | 154 | 1777 | ▁▇▁▁▁ |
| CRSElapsedTime | 0 | 1.0 | 124.39 | 69.33 | -32 | 75 | 105 | 155 | 1441 | ▇▁▁▁▁ |
| AirTime | 0 | 1.0 | 103.27 | 81.93 | -3818 | 53 | 85 | 137 | 3508 | ▁▁▇▁▁ |
| ArrDelay | 0 | 1.0 | 5.12 | 31.96 | -1302 | -10 | -2 | 9 | 1879 | ▁▁▇▁▁ |
| DepDelay | 0 | 1.0 | 6.60 | 28.11 | -1410 | -4 | 0 | 4 | 1882 | ▁▁▇▁▁ |
| Distance | 0 | 1.0 | 714.99 | 569.48 | 8 | 305 | 547 | 946 | 4962 | ▇▂▁▁▁ |
| TaxiIn | 0 | 1.0 | 7.47 | 41.22 | 0 | 4 | 5 | 7 | 1495 | ▇▁▁▁▁ |
| TaxiOut | 0 | 1.0 | 15.67 | 13.13 | 0 | 10 | 13 | 18 | 3905 | ▇▁▁▁▁ |
| Cancelled | 0 | 1.0 | 0.00 | 0.00 | 0 | 0 | 0 | 0 | 0 | ▁▁▇▁▁ |
| Diverted | 0 | 1.0 | 0.00 | 0.00 | 0 | 0 | 0 | 0 | 0 | ▁▁▇▁▁ |
| CarrierDelay | 2619866 | 0.8 | 2.55 | 16.17 | 0 | 0 | 0 | 0 | 1879 | ▇▁▁▁▁ |
| WeatherDelay | 2619866 | 0.8 | 0.65 | 8.39 | 0 | 0 | 0 | 0 | 1230 | ▇▁▁▁▁ |
| NASDelay | 2619866 | 0.8 | 3.38 | 14.73 | -60 | 0 | 0 | 0 | 1385 | ▇▁▁▁▁ |
| SecurityDelay | 2619866 | 0.8 | 0.02 | 1.21 | 0 | 0 | 0 | 0 | 533 | ▇▁▁▁▁ |
| LateAircraftDelay | 2619866 | 0.8 | 3.22 | 16.24 | 0 | 0 | 0 | 0 | 1407 | ▇▁▁▁▁ |
# First, we select only numerical variables that are available even before the flight departs (so it can be used for future predictions)
# ArrDelay is kept as well because it is the prediction target.
flight_selected <- flight %>%
select(Month, DayofMonth, DayOfWeek, CRSDepTime, CRSArrTime, CRSElapsedTime, Distance, ArrDelay)
# Predicting ArrDelay
# Wrap the selected columns in an mlr3 regression task with ArrDelay as target
task <- TaskRegr$new(id = "flight_selected", backend = flight_selected, target = 'ArrDelay')
# Take MSE as measure
measure <- msr("regr.mse")
# Splitting the data into train and test: 70% of the row ids are drawn at
# random for training and the remaining ids form the test set.
# The seed is fixed so the split is reproducible; keep set.seed() directly
# before sample() or the drawn rows will change.
set.seed(3005)
train_data <- sample(task$nrow, 0.7 * task$nrow)
test_data <- setdiff(seq_len(task$nrow), train_data)
# Ridge regression: glmnet with alpha = 0 and a fixed penalty lambda = 0.1.
# The hyperparameters are passed through lrn() instead of assigning
# `param_set$values <- list(...)`, because that assignment replaces the whole
# value list and would discard anything the learner pre-sets.
learner_ridge <- lrn('regr.glmnet', alpha = 0, lambda = 0.1)
glrn_ridge <- GraphLearner$new(learner_ridge)
# Fit on the training rows only
glrn_ridge$train(task, row_ids = train_data)
# Ridge Train data result
glrn_ridge$predict(task, row_ids = train_data)$score()
## regr.mse
## 1007.176
# Ridge Test data result
glrn_ridge$predict(task, row_ids = test_data)$score()
## regr.mse
## 1005.817
# LASSO regression: glmnet with alpha = 1 and a fixed penalty lambda = 0.1.
# The hyperparameters are passed through lrn() instead of assigning
# `param_set$values <- list(...)`, because that assignment replaces the whole
# value list and would discard anything the learner pre-sets.
learner_lasso <- lrn('regr.glmnet', alpha = 1, lambda = 0.1)
glrn_lasso <- GraphLearner$new(learner_lasso)
# Fit on the training rows only
glrn_lasso$train(task, row_ids = train_data)
# LASSO Train data result
glrn_lasso$predict(task, row_ids = train_data)$score()
## regr.mse
## 1007.222
# LASSO Test data result
glrn_lasso$predict(task, row_ids = test_data)$score()
## regr.mse
## 1005.873
# Take a sample of 50,000 rows
# The seed is fixed so the same rows are drawn on every run.
set.seed(2002)
flight_sample <- sample_n(flight, 50000)
# Selecting numerical variables from the sample dataset
# (same predictor columns and target as the full-data task)
flight_sample_selected <- flight_sample %>%
select(Month, DayofMonth, DayOfWeek, CRSDepTime, CRSArrTime, CRSElapsedTime, Distance, ArrDelay)
# Replacing the task with the sample dataset
# NOTE: this overwrites `task`, so everything below works on the sample only.
task <- TaskRegr$new(id = "flight_sample_selected", backend = flight_sample_selected, target = 'ArrDelay')
# Splitting the data into train and test (70% / 30% of the sampled rows);
# `train_data` and `test_data` are overwritten with ids valid for the new task.
set.seed(3005)
train_data <- sample(task$nrow, 0.7 * task$nrow)
test_data <- setdiff(seq_len(task$nrow), train_data)
# Set the learner to random forest (ranger) and tune its number of trees on the training rows
learner_rf <- lrn('regr.ranger')
# NOTE(review): assigning `$values` replaces the learner's whole value list,
# including anything it pre-sets - confirm that this is intended.
learner_rf$param_set$values <- list(min.node.size = 4)
glrn_rf <- GraphLearner$new(learner_rf)
# Grid search, stopped after at most 30 evaluated configurations
tuner <- tnr('grid_search')
terminator <- trm('evals', n_evals = 30)
# Search space: number of trees between 100 and 500. The parameter id carries
# the 'regr.ranger.' prefix because the learner is wrapped in a GraphLearner.
tune_ntrees <- ParamSet$new(
list(
ParamInt$new('regr.ranger.num.trees', lower = 100, upper = 500)
)
)
# Each configuration is scored with 5-fold cross-validation using `measure` (MSE)
autotune_rf <- AutoTuner$new(
learner = glrn_rf,
resampling = rsmp('cv', folds = 5),
measure = measure,
search_space = tune_ntrees,
terminator = terminator,
tuner = tuner
)
# Run the tuning on the training rows only
autotune_rf$train(task, row_ids = train_data)
## INFO [10:35:15.148] [bbotk] Starting to optimize 1 parameter(s) with '<TunerGridSearch>' and '<TerminatorEvals> [n_evals=30, k=0]'
## INFO [10:35:15.219] [bbotk] Evaluating 1 configuration(s)
## INFO [10:35:15.280] [mlr3] Running benchmark with 5 resampling iterations
## INFO [10:35:15.498] [mlr3] Applying learner 'regr.ranger' on task 'flight_sample_selected' (iter 4/5)
## INFO [10:35:20.639] [mlr3] Applying learner 'regr.ranger' on task 'flight_sample_selected' (iter 5/5)
## INFO [10:35:25.319] [mlr3] Applying learner 'regr.ranger' on task 'flight_sample_selected' (iter 1/5)
## INFO [10:35:30.165] [mlr3] Applying learner 'regr.ranger' on task 'flight_sample_selected' (iter 3/5)
## INFO [10:35:34.976] [mlr3] Applying learner 'regr.ranger' on task 'flight_sample_selected' (iter 2/5)
## INFO [10:35:39.994] [mlr3] Finished benchmark
## INFO [10:35:40.087] [bbotk] Result of batch 1:
## INFO [10:35:40.096] [bbotk] regr.ranger.num.trees regr.mse warnings errors runtime_learners
## INFO [10:35:40.096] [bbotk] 233 1000.593 0 0 24.37
## INFO [10:35:40.096] [bbotk] uhash
## INFO [10:35:40.096] [bbotk] df97f0f5-532e-4052-9e04-a027907cf565
## INFO [10:35:40.099] [bbotk] Evaluating 1 configuration(s)
## INFO [10:35:40.212] [mlr3] Running benchmark with 5 resampling iterations
## INFO [10:35:40.237] [mlr3] Applying learner 'regr.ranger' on task 'flight_sample_selected' (iter 4/5)
## INFO [10:35:47.073] [mlr3] Applying learner 'regr.ranger' on task 'flight_sample_selected' (iter 5/5)
## INFO [10:35:54.617] [mlr3] Applying learner 'regr.ranger' on task 'flight_sample_selected' (iter 1/5)
## INFO [10:36:01.667] [mlr3] Applying learner 'regr.ranger' on task 'flight_sample_selected' (iter 3/5)
## INFO [10:36:09.071] [mlr3] Applying learner 'regr.ranger' on task 'flight_sample_selected' (iter 2/5)
## INFO [10:36:16.570] [mlr3] Finished benchmark
## INFO [10:36:16.679] [bbotk] Result of batch 2:
## INFO [10:36:16.709] [bbotk] regr.ranger.num.trees regr.mse warnings errors runtime_learners
## INFO [10:36:16.709] [bbotk] 367 999.4707 0 0 36.22
## INFO [10:36:16.709] [bbotk] uhash
## INFO [10:36:16.709] [bbotk] f54343e2-4c2b-4686-bb72-8417528d09a2
## INFO [10:36:16.713] [bbotk] Evaluating 1 configuration(s)
## INFO [10:36:16.777] [mlr3] Running benchmark with 5 resampling iterations
## INFO [10:36:16.799] [mlr3] Applying learner 'regr.ranger' on task 'flight_sample_selected' (iter 4/5)
## INFO [10:36:29.573] [mlr3] Applying learner 'regr.ranger' on task 'flight_sample_selected' (iter 5/5)
## INFO [10:36:39.097] [mlr3] Applying learner 'regr.ranger' on task 'flight_sample_selected' (iter 1/5)
## INFO [10:36:48.830] [mlr3] Applying learner 'regr.ranger' on task 'flight_sample_selected' (iter 3/5)
## INFO [10:36:58.696] [mlr3] Applying learner 'regr.ranger' on task 'flight_sample_selected' (iter 2/5)
## INFO [10:37:08.116] [mlr3] Finished benchmark
## INFO [10:37:08.202] [bbotk] Result of batch 3:
## INFO [10:37:08.206] [bbotk] regr.ranger.num.trees regr.mse warnings errors runtime_learners
## INFO [10:37:08.206] [bbotk] 456 999.2912 0 0 51.2
## INFO [10:37:08.206] [bbotk] uhash
## INFO [10:37:08.206] [bbotk] e75854b9-cf2a-485a-9774-758391d40dd7
## INFO [10:37:08.209] [bbotk] Evaluating 1 configuration(s)
## INFO [10:37:08.299] [mlr3] Running benchmark with 5 resampling iterations
## INFO [10:37:08.320] [mlr3] Applying learner 'regr.ranger' on task 'flight_sample_selected' (iter 4/5)
## INFO [10:37:17.802] [mlr3] Applying learner 'regr.ranger' on task 'flight_sample_selected' (iter 5/5)
## INFO [10:37:26.615] [mlr3] Applying learner 'regr.ranger' on task 'flight_sample_selected' (iter 1/5)
## INFO [10:37:35.628] [mlr3] Applying learner 'regr.ranger' on task 'flight_sample_selected' (iter 3/5)
## INFO [10:37:44.417] [mlr3] Applying learner 'regr.ranger' on task 'flight_sample_selected' (iter 2/5)
## INFO [10:37:53.243] [mlr3] Finished benchmark
## INFO [10:37:53.364] [bbotk] Result of batch 4:
## INFO [10:37:53.370] [bbotk] regr.ranger.num.trees regr.mse warnings errors runtime_learners
## INFO [10:37:53.370] [bbotk] 411 999.4785 0 0 44.84
## INFO [10:37:53.370] [bbotk] uhash
## INFO [10:37:53.370] [bbotk] 5a509288-0a59-422f-93e8-b9af5de7a041
## INFO [10:37:53.376] [bbotk] Evaluating 1 configuration(s)
## INFO [10:37:53.484] [mlr3] Running benchmark with 5 resampling iterations
## INFO [10:37:53.510] [mlr3] Applying learner 'regr.ranger' on task 'flight_sample_selected' (iter 5/5)
## INFO [10:37:55.561] [mlr3] Applying learner 'regr.ranger' on task 'flight_sample_selected' (iter 3/5)
## INFO [10:37:57.578] [mlr3] Applying learner 'regr.ranger' on task 'flight_sample_selected' (iter 2/5)
## INFO [10:37:59.560] [mlr3] Applying learner 'regr.ranger' on task 'flight_sample_selected' (iter 1/5)
## INFO [10:38:01.588] [mlr3] Applying learner 'regr.ranger' on task 'flight_sample_selected' (iter 4/5)
## INFO [10:38:03.628] [mlr3] Finished benchmark
## INFO [10:38:03.740] [bbotk] Result of batch 5:
## INFO [10:38:03.746] [bbotk] regr.ranger.num.trees regr.mse warnings errors runtime_learners
## INFO [10:38:03.746] [bbotk] 100 1006.07 0 0 10
## INFO [10:38:03.746] [bbotk] uhash
## INFO [10:38:03.746] [bbotk] f247da3b-92ef-4a6c-909b-4779f392567a
## INFO [10:38:03.750] [bbotk] Evaluating 1 configuration(s)
## INFO [10:38:03.846] [mlr3] Running benchmark with 5 resampling iterations
## INFO [10:38:03.864] [mlr3] Applying learner 'regr.ranger' on task 'flight_sample_selected' (iter 3/5)
## INFO [10:38:06.786] [mlr3] Applying learner 'regr.ranger' on task 'flight_sample_selected' (iter 2/5)
## INFO [10:38:09.880] [mlr3] Applying learner 'regr.ranger' on task 'flight_sample_selected' (iter 1/5)
## INFO [10:38:12.665] [mlr3] Applying learner 'regr.ranger' on task 'flight_sample_selected' (iter 5/5)
## INFO [10:38:16.397] [mlr3] Applying learner 'regr.ranger' on task 'flight_sample_selected' (iter 4/5)
## INFO [10:38:19.224] [mlr3] Finished benchmark
## INFO [10:38:19.334] [bbotk] Result of batch 6:
## INFO [10:38:19.340] [bbotk] regr.ranger.num.trees regr.mse warnings errors runtime_learners
## INFO [10:38:19.340] [bbotk] 144 1003.69 0 0 15.25
## INFO [10:38:19.340] [bbotk] uhash
## INFO [10:38:19.340] [bbotk] 99996684-c03d-4a90-b708-b5a9eb06a793
## INFO [10:38:19.345] [bbotk] Evaluating 1 configuration(s)
## INFO [10:38:19.437] [mlr3] Running benchmark with 5 resampling iterations
## INFO [10:38:19.456] [mlr3] Applying learner 'regr.ranger' on task 'flight_sample_selected' (iter 5/5)
## INFO [10:38:25.724] [mlr3] Applying learner 'regr.ranger' on task 'flight_sample_selected' (iter 3/5)
## INFO [10:38:31.740] [mlr3] Applying learner 'regr.ranger' on task 'flight_sample_selected' (iter 2/5)
## INFO [10:38:37.772] [mlr3] Applying learner 'regr.ranger' on task 'flight_sample_selected' (iter 4/5)
## INFO [10:38:43.468] [mlr3] Applying learner 'regr.ranger' on task 'flight_sample_selected' (iter 1/5)
## INFO [10:38:49.290] [mlr3] Finished benchmark
## INFO [10:38:49.410] [bbotk] Result of batch 7:
## INFO [10:38:49.416] [bbotk] regr.ranger.num.trees regr.mse warnings errors runtime_learners
## INFO [10:38:49.416] [bbotk] 278 999.7746 0 0 29.63
## INFO [10:38:49.416] [bbotk] uhash
## INFO [10:38:49.416] [bbotk] 15fed893-79b9-46da-af52-85d83350c773
## INFO [10:38:49.461] [bbotk] Evaluating 1 configuration(s)
## INFO [10:38:49.566] [mlr3] Running benchmark with 5 resampling iterations
## INFO [10:38:49.586] [mlr3] Applying learner 'regr.ranger' on task 'flight_sample_selected' (iter 4/5)
## INFO [10:39:00.432] [mlr3] Applying learner 'regr.ranger' on task 'flight_sample_selected' (iter 2/5)
## INFO [10:39:11.459] [mlr3] Applying learner 'regr.ranger' on task 'flight_sample_selected' (iter 5/5)
## INFO [10:39:21.732] [mlr3] Applying learner 'regr.ranger' on task 'flight_sample_selected' (iter 3/5)
## INFO [10:39:32.432] [mlr3] Applying learner 'regr.ranger' on task 'flight_sample_selected' (iter 1/5)
## INFO [10:39:42.250] [mlr3] Finished benchmark
## INFO [10:39:42.400] [bbotk] Result of batch 8:
## INFO [10:39:42.404] [bbotk] regr.ranger.num.trees regr.mse warnings errors runtime_learners
## INFO [10:39:42.404] [bbotk] 500 998.6334 0 0 52.5
## INFO [10:39:42.404] [bbotk] uhash
## INFO [10:39:42.404] [bbotk] bb0ec28c-3365-4691-9c81-fe15c35473e5
## INFO [10:39:42.407] [bbotk] Evaluating 1 configuration(s)
## INFO [10:39:42.469] [mlr3] Running benchmark with 5 resampling iterations
## INFO [10:39:42.482] [mlr3] Applying learner 'regr.ranger' on task 'flight_sample_selected' (iter 4/5)
## INFO [10:39:49.033] [mlr3] Applying learner 'regr.ranger' on task 'flight_sample_selected' (iter 2/5)
## INFO [10:39:55.035] [mlr3] Applying learner 'regr.ranger' on task 'flight_sample_selected' (iter 5/5)
## INFO [10:40:03.525] [mlr3] Applying learner 'regr.ranger' on task 'flight_sample_selected' (iter 3/5)
## INFO [10:40:12.041] [mlr3] Applying learner 'regr.ranger' on task 'flight_sample_selected' (iter 1/5)
## INFO [10:40:18.357] [mlr3] Finished benchmark
## INFO [10:40:18.430] [bbotk] Result of batch 9:
## INFO [10:40:18.434] [bbotk] regr.ranger.num.trees regr.mse warnings errors runtime_learners
## INFO [10:40:18.434] [bbotk] 322 999.5913 0 0 35.77
## INFO [10:40:18.434] [bbotk] uhash
## INFO [10:40:18.434] [bbotk] ff8b78d4-627d-4660-baf5-5db393e4dd31
## INFO [10:40:18.436] [bbotk] Evaluating 1 configuration(s)
## INFO [10:40:18.487] [mlr3] Running benchmark with 5 resampling iterations
## INFO [10:40:18.500] [mlr3] Applying learner 'regr.ranger' on task 'flight_sample_selected' (iter 2/5)
## INFO [10:40:22.345] [mlr3] Applying learner 'regr.ranger' on task 'flight_sample_selected' (iter 5/5)
## INFO [10:40:25.925] [mlr3] Applying learner 'regr.ranger' on task 'flight_sample_selected' (iter 3/5)
## INFO [10:40:29.495] [mlr3] Applying learner 'regr.ranger' on task 'flight_sample_selected' (iter 1/5)
## INFO [10:40:33.020] [mlr3] Applying learner 'regr.ranger' on task 'flight_sample_selected' (iter 4/5)
## INFO [10:40:36.753] [mlr3] Finished benchmark
## INFO [10:40:36.836] [bbotk] Result of batch 10:
## INFO [10:40:36.840] [bbotk] regr.ranger.num.trees regr.mse warnings errors runtime_learners
## INFO [10:40:36.840] [bbotk] 189 1002.084 0 0 18.1
## INFO [10:40:36.840] [bbotk] uhash
## INFO [10:40:36.840] [bbotk] 92511b9e-2358-4c6b-8dd9-659d4d0bb9b8
## INFO [10:40:36.856] [bbotk] Finished optimizing after 10 evaluation(s)
## INFO [10:40:36.858] [bbotk] Result:
## INFO [10:40:36.861] [bbotk] regr.ranger.num.trees learner_param_vals x_domain regr.mse
## INFO [10:40:36.861] [bbotk] 500 <list[2]> <list[1]> 998.6334
# Random Forest: error on the training rows.
# Predict with the tuned learner on the rows listed in `train_data`
# (presumably a vector of task row ids -- confirm where it is defined),
# then score the prediction. No measure is passed to score(); the printed
# result is labelled regr.mse.
rf_train_prediction <- autotune_rf$predict(task, row_ids = train_data)
rf_train_prediction$score()
## regr.mse
## 232.4166
# Random Forest: error on the held-out rows.
# Predict with the tuned learner on the rows listed in `test_data`
# (presumably a vector of task row ids -- confirm where it is defined),
# then score the prediction. No measure is passed to score(); the printed
# result is labelled regr.mse.
rf_test_prediction <- autotune_rf$predict(task, row_ids = test_data)
rf_test_prediction$score()
## regr.mse
## 985.8778